Network Analysis


In [13]:
import networkx as nx
import igraph as ig
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [14]:
graph = ig.Graph.Read_Ncol('../omim/disease.net.w.txt', weights=True, names=True)
graph = graph.as_undirected(mode='collapse')

Creating a graph from scratch


In [15]:
g = ig.Graph()
g.add_vertices([0,1,2,3,4])
g.add_edges([(0,1), (1,2), (2,3), (2,4)])
nx.draw(nx.from_edgelist(g.get_edgelist()))


Generating graphs

Erdos-Renyi random graph


In [16]:
er = ig.Graph.Erdos_Renyi(100, 0.1)
nx.draw(nx.from_edgelist(er.get_edgelist()))


Geometric random graph


In [17]:
grg = ig.Graph.GRG(100, 0.2)
nx.draw(nx.from_edgelist(grg.get_edgelist()))


Barabasi-Albert preferential attachment for random graphs


In [18]:
ba = ig.Graph.Barabasi(100)
nx.draw(nx.from_edgelist(ba.get_edgelist()))



In [19]:
# power-law degree distribution
from collections import Counter
ba_max = ig.Graph.Barabasi(100000)
cnt = Counter(ba_max.degree()).items()
deg_cnt = sorted(cnt)
degs, cnts = list(zip(*deg_cnt))
freq_cnts = np.array(cnts)/np.sum(cnts)
plt.loglog(degs, freq_cnts, basex=2);


Structural properties of a Human Disease Network


In [20]:
graph = ig.Graph.Read_Ncol('../omim/disease.net.w.txt', weights=True, names=True)
graph = graph.as_undirected(mode='collapse')

In [21]:
print('Nodes: %d' % graph.vcount())
print('Edges: %d' % graph.ecount())
c = graph.components(mode='strong')
giant = c.giant()
print('Strong conn. comp: %d' % len(c.sizes()))
print('Giant component: %d' %  giant.vcount())
print('Closed triangles: %d' % graph.as_directed().triad_census().t300)
print('Diameter: %d' % giant.diameter())
print('Clustering: %3.3f' % graph.transitivity_avglocal_undirected())


Nodes: 867
Edges: 1527
Strong conn. comp: 123
Giant component: 516
Closed triangles: 1517
Diameter: 15
Clustering: 0.805

Graph layout algorithms and visualization


In [22]:
nx_graph = nx.from_edgelist(graph.get_edgelist())
spring_pos = nx.spring_layout(nx_graph)
nx.draw(nx_graph, pos=spring_pos, node_size=3, node_color='black', edge_color='black', width=1)



In [23]:
f = open('../omim/disease_names.txt')
did2name = dict([line.strip().split('\t') for line in f])
f.close()
graph.vs['disease'] = [did2name[did] for did in graph.vs['name']]

In [24]:
f = open('../omim/disease_classes.txt')
did2class = dict([line.strip().split('\t') for line in f])
f.close()

dclasses = set(did2class.values())
graph.vs['dclass'] = [did2class[did] for did in graph.vs['name']]

c = ['orchid', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'red', 'rosybrown',
'royalblue', 'orange', 'salmon', 'silver', 'skyblue', 'slategray', 'springgreen', 
'steelblue', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'yellow']
class2color = {dclass : c[i] for i, dclass in enumerate(dclasses)}
colors = [class2color[n['dclass']] for n in graph.vs]
print('Disease classes:\n %s' % '\n '.join('%12s %s' % (class2color[dclass], dclass) for dclass in dclasses))

nx_graph = nx.from_edgelist(graph.get_edgelist())
spring_pos = nx.spring_layout(nx_graph)
nx.draw(nx_graph, pos=spring_pos, node_size=30, node_color=colors, edge_color='black', width=1)


Disease classes:
       orchid Metabolic
         peru Hematological
         pink Gastrointestinal
         plum Dermatological
   powderblue Developmental
       purple Nutritional
          red multiple
    rosybrown Neurological
    royalblue Unclassified
       orange Muscular
       salmon Ear,Nose,Throat
       silver Skeletal
      skyblue Respiratory
    slategray Cancer
  springgreen Cardiovascular
    steelblue Bone
       tomato Immunological
    turquoise Ophthamological
       violet Connective tissue disorder
        wheat Psychiatric
        white Endocrine
       yellow Renal

The Cancer subnetwork


In [25]:
# Keyword arguments can be used to filter the vertices based on their attributes or their structural properties. 
eye_nodes = graph.vs.select(dclass='Ophthamological')
print('Eye-related diseases: %d' % len(eye_nodes))

subgraph = graph.subgraph(eye_nodes)
labels = dict(enumerate(subgraph.vs['disease']))

nx_graph = nx.Graph()
nx_graph.add_nodes_from(labels.keys())
nx_graph.add_edges_from(subgraph.get_edgelist())
print('Nodes: %d Edges: %d' % (len(nx_graph.nodes()), len(nx_graph.edges())))

spring_pos = nx.spring_layout(nx_graph)
nx.draw(nx_graph, pos=spring_pos, labels=labels, with_labels=False)


Eye-related diseases: 48
Nodes: 48 Edges: 75

Node-centric analysis


In [26]:
# Finding a single node with some properties
leukemia = graph.vs.find(disease="Leukemia")
leukemia_nghbs = leukemia.neighbors()
print('Neighboring nodes:\n %s' % '\n '.join(nghb['disease'] for nghb in leukemia_nghbs))

leukemia_paths = np.squeeze(np.array(leukemia.shortest_paths()))
# Nodes that are close to the ``leukemia'' in terms of their shortest path distance
nearby_idx = np.nonzero(leukemia_paths < 2)[0]
print()
print('Nearby nodes:\n %s' % '\n '.join(graph.vs[i]['disease'] for i in nearby_idx[:10]))

# Nodes that are far away from the ``leukemia'' in terms of their shortest path distance
distant_idx = np.nonzero(np.isinf(leukemia_paths))[0]
print()
print('Distant nodes:\n %s' % '\n '.join(graph.vs[i]['disease'] for i in distant_idx[:10]))


Neighboring nodes:
 Bladder cancer
 Colorectal cancer with chromosomal instability (3)
 Breast-ovarian cancer (3)
 Lipomatosis
 Platelet glycoprotein IV deficiency
 Rheumatoid arthritis
 Benzene toxicity
 Lung cancer
 Pancreatic carcinoma
 Stomach cancer
 Neurofibromatosis
 von Hippel-Lindau syndrome
 Germ cell tumors
 Dyserythropoietic anemia with thrombocytopenia
 Macrothrombocytopenia
 Gastrointestinal stromal tumor
 Mast cell leukemia (3)
 Mastocytosis with associated hematologic disorder (3)
 Piebaldism (3)
 Growth hormone insensitivity with immunodeficiency
 Leopard syndrome
 Noonan syndrome 1
 Nijmegen breakage syndrome
 Neurofibromatosis
 Neurofibromatosis-Noonan syndrome
 Watson syndrome

Nearby nodes:
 Bladder cancer
 Colorectal cancer with chromosomal instability (3)
 Breast-ovarian cancer (3)
 Lipomatosis
 Platelet glycoprotein IV deficiency
 Rheumatoid arthritis
 Benzene toxicity
 Leukemia
 Lung cancer
 Pancreatic carcinoma

Distant nodes:
 Abacavir hypersensitivity
 Ankylosing spoldylitis
 Stevens-Johnson syndrome
 Acampomelic campolelic dysplasia
 Campomelic dysplasia with autosomal sex reversal
 Acrocallosal syndrome
 Greig cephalopolysyndactyly syndrome
 Pallister-Hall syndrome
 Polydactyly
 Acrocapitofemoral dysplasia

Network centrality measures

Node degrees


In [27]:
deg_dis = sorted([(n.degree(), n['disease']) for n in graph.vs], reverse=True)
print('Max deg.:\n%s' %  '\n'.join('%d %s' % (deg, dis) for deg, dis in deg_dis[:10]))


Max deg.:
50 Colorectal cancer with chromosomal instability (3)
30 Breast-ovarian cancer (3)
27 Gastric cancer
26 Thyroid carcinoma
26 Leukemia
25 Deafness
24 Diabetic retinopathy
23 Pancreatic carcinoma
20 Prostate cancer
16 Retinitis punctata albescens

Node betweenness


In [28]:
nbs = graph.betweenness()
bs_dis = sorted([(bs, n['disease']) for bs, n in zip(nbs, graph.vs)], reverse=True)
print('Betweenness:\n%s' % '\n'.join('%4.3f %s' % (bs, dis) for bs, dis in bs_dis[:10]))


Betweenness:
62124.033 Cardiomyopathy
54197.921 Lipodystrophy
36821.471 Diabetic retinopathy
31406.331 Glioblastoma
26509.999 Deafness
23255.667 Myopathy
22276.000 Cataract
17626.775 Colorectal cancer with chromosomal instability (3)
16394.233 Leukemia
15368.879 Alzheimer disease

In [29]:
bs_id = sorted([(bs, i) for i, bs in enumerate(nbs)], reverse=True)
ids = list(zip(*bs_id))[1]
subgraph = graph.subgraph(ids[:7])
labels = dict(enumerate(subgraph.vs['disease']))

nx_graph = nx.Graph()
nx_graph.add_nodes_from(labels.keys())
nx_graph.add_edges_from(subgraph.get_edgelist())

spring_pos = nx.spring_layout(nx_graph)
nx.draw(nx_graph, pos=spring_pos, labels=labels, with_labels=True)


Google Pagerank


In [30]:
prs = graph.pagerank()
pr_dis = sorted([(pr, i) for i, pr in enumerate(prs)], reverse=True)
print('Pagerank:\n%s' % '\n'.join('%4.3f %s' % (pr, graph.vs[i]['disease']) for pr, i in pr_dis[:10]))


Pagerank:
0.008 Colorectal cancer with chromosomal instability (3)
0.007 Deafness
0.006 Diabetic retinopathy
0.005 Leukemia
0.005 Breast-ovarian cancer (3)
0.005 Retinitis punctata albescens
0.004 Cardiomyopathy
0.004 Gastric cancer
0.004 Thyroid carcinoma
0.004 Mental retardation

In [31]:
top_k = 10
top_ids = [i for _, i in pr_dis[:top_k]]

nx_graph = nx.from_edgelist(graph.get_edgelist())

colors = ['red' if i in top_ids else 'black' for i, _ in enumerate(prs)]
node_size = [10 + 20 * (top_k - top_ids.index(i)) if i in top_ids else 10 for i, _ in enumerate(prs)]

spring_pos = nx.spring_layout(nx_graph)
nx.draw(nx_graph, pos=spring_pos, node_color=colors, edge_color='black', node_size=node_size, width=1)


Network community detection using Walktrap algorithm


In [32]:
c = graph.components(mode='strong')
giant = c.giant()
rndwlk = giant.community_walktrap(steps=3).as_clustering()
mmbr_rndwlk = rndwlk.membership

n_cmt = len(set(mmbr_rndwlk))
cnt = Counter(mmbr_rndwlk)
print('C: %5.2f S: %5.2f' % (n_cmt, sum(cnt.values()) / n_cmt))

# Modularity is one measure of the structure of networks or graphs. 
# It was designed to measure the strength of division of a network 
# into modules. 
# Networks with high modularity have dense connections between the nodes 
# within modules but sparse connections between nodes in different modules. 

# Modularity of S is the difference between mS, the number of edges between nodes in S, 
# and E(mS), the expected number of such edges in a random graph with identical degree 
# sequence. The value of the modularity lies in the range [−1/2,1). It is positive if 
# the number of edges within groups exceeds the number expected on the basis of chance. 
print('Modularity (random walks): %3.3f' % rndwlk.modularity)


C: 36.00 S: 14.33
Modularity (random walks): 0.791

In [33]:
nx_graph = nx.Graph()
nx_graph.add_edges_from(giant.get_edgelist())

spring_pos = nx.fruchterman_reingold_layout(nx_graph)
cmt2col = {cmt: np.random.rand(3) for cmt in set(mmbr_rndwlk)}
colors = [cmt2col[mmbr_rndwlk[i]] for i in range(giant.vcount())]
nx.draw(nx_graph, pos=spring_pos, node_color=colors, node_size=20, linewidths=0)
plt.savefig('../omim/omim-walktrap.pdf')


Network community detection with label propagation algorithm


In [34]:
c = graph.components(mode='strong')
giant = c.giant()
lblprp = giant.community_label_propagation()
mmbr_lblprp = lblprp.membership	

n_cmt = len(set(mmbr_lblprp))
cnt = Counter(mmbr_lblprp)
print('C: %5.2f S: %5.2f' % (n_cmt, sum(cnt.values())/n_cmt))
print('Modularity (label propagation): %3.3f' % lblprp.modularity)


C: 60.00 S:  8.60
Modularity (label propagation): 0.761

In [35]:
nx_graph = nx.Graph()
nx_graph.add_edges_from(giant.get_edgelist())

spring_pos = nx.fruchterman_reingold_layout(nx_graph)
cmt2col = {cmt: np.random.rand(3) for cmt in set(mmbr_lblprp)}
colors = [cmt2col[mmbr_lblprp[i]] for i in range(giant.vcount())]
nx.draw(nx_graph, pos=spring_pos, node_color=colors, node_size=20, linewidths=0)
plt.savefig('../omim/omim-labprop.pdf')


Network community detection with Infomap algorithm


In [36]:
c = graph.components(mode='strong')
giant = c.giant()
infomap = giant.community_infomap()
mmbr_infomap = infomap.membership

n_cmt = len(set(mmbr_infomap))
cnt = Counter(mmbr_infomap)
print('C: %5.2f S: %5.2f' % (n_cmt, sum(cnt.values())/n_cmt))
print('Modularity (Infomap): %3.3f' % infomap.modularity)
print('Codelength (Infomap): %3.3f' % infomap.codelength)


C: 57.00 S:  9.05
Modularity (Infomap): 0.784
Codelength (Infomap): 5.670

In [37]:
nx_graph = nx.Graph()
nx_graph.add_edges_from(giant.get_edgelist())

spring_pos = nx.fruchterman_reingold_layout(nx_graph)
cmt2col = {cmt: np.random.rand(3) for cmt in set(mmbr_infomap)}
colors = [cmt2col[mmbr_infomap[i]] for i in range(giant.vcount())]
nx.draw(nx_graph, pos=spring_pos, node_color=colors, node_size=20, linewidths=0)
plt.savefig('../omim/omim-infomap.pdf')


igraph, networkx and the outside world


In [38]:
# Write the OMIM network in a format for explorative data analysis in Gephi

# Labeled edgelist
graph.write_graphml('../omim/omim_gephi.graphml')

# Now go and explore the network with Gephi (http://gephi.github.io/users/download/)!